import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN
import warnings
warnings.filterwarnings("ignore")
df = pd.read_csv("cleandata.csv")
df.head()
Unnamed: 0 | SeniorCitizen | MonthlyCharges | TotalCharges | Churn | gender_Male | Partner_Yes | Dependents_Yes | PhoneService_Yes | MultipleLines_No phone service | ... | PaperlessBilling_Yes | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | tenure_group_12 - 23 | tenure_group_24 - 35 | tenure_group_36 - 47 | tenure_group_48 - 59 | tenure_group_60 - 71 | tenure_group_72 - 72 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 29.85 | 29.85 | 0 | 0 | 1 | 0 | 0 | 1 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 1 | 0 | 56.95 | 1889.50 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 |
2 | 2 | 0 | 53.85 | 108.15 | 1 | 1 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 3 | 0 | 42.30 | 1840.75 | 0 | 1 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
4 | 4 | 0 | 70.70 | 151.65 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 37 columns
df.drop('Unnamed: 0', axis=1, inplace=True)
df.head()
SeniorCitizen | MonthlyCharges | TotalCharges | Churn | gender_Male | Partner_Yes | Dependents_Yes | PhoneService_Yes | MultipleLines_No phone service | MultipleLines_Yes | ... | PaperlessBilling_Yes | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | tenure_group_12 - 23 | tenure_group_24 - 35 | tenure_group_36 - 47 | tenure_group_48 - 59 | tenure_group_60 - 71 | tenure_group_72 - 72 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 29.85 | 29.85 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 56.95 | 1889.50 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 |
2 | 0 | 53.85 | 108.15 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 42.30 | 1840.75 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
4 | 0 | 70.70 | 151.65 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 36 columns
x = df.drop('Churn', axis=1)
y = df['Churn']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3)
model = DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)
model.fit(x_train,y_train)
DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=100)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=100)
predict = model.predict(x_test)
predict
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
# Print accuracy score
accuracy = accuracy_score(y_test, predict)
print("Accuracy:", accuracy)
# Print classification report
print("Classification Report:\n", classification_report(y_test, predict))
# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, predict))
Accuracy: 0.7742546142924751 Classification Report: precision recall f1-score support 0 0.83 0.87 0.85 1564 1 0.57 0.51 0.54 549 accuracy 0.77 2113 macro avg 0.70 0.69 0.69 2113 weighted avg 0.77 0.77 0.77 2113 Confusion Matrix: [[1358 206] [ 271 278]]
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
model_rf = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=6, min_samples_leaf=8)
model_rf.fit(x_train, y_train)
RandomForestClassifier(max_depth=6, min_samples_leaf=8)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(max_depth=6, min_samples_leaf=8)
model_gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
model_gbm.fit(x_train, y_train)
GradientBoostingClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GradientBoostingClassifier()
model_knn = KNeighborsClassifier(n_neighbors=5)
model_knn.fit(x_train, y_train)
KNeighborsClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KNeighborsClassifier()
model_svm = SVC(kernel='rbf', C=1.0, probability=True)
model_svm.fit(x_train, y_train)
SVC(probability=True)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVC(probability=True)
models = [model_rf, model_gbm, model_knn, model_svm]
model_names = ['Random Forest', 'GBM', 'KNN', 'SVM']
for model, name in zip(models, model_names):
y_pred = model.predict(x_test)
accuracy = model.score(x_test, y_test)
print(f"Classifier: {name}")
print(f"Accuracy: {accuracy:.2f}")
print(metrics.classification_report(y_test, y_pred))
print("------------")
Classifier: Random Forest Accuracy: 0.79 precision recall f1-score support 0 0.81 0.93 0.87 1564 1 0.68 0.40 0.50 549 accuracy 0.79 2113 macro avg 0.75 0.66 0.68 2113 weighted avg 0.78 0.79 0.77 2113 ------------ Classifier: GBM Accuracy: 0.80 precision recall f1-score support 0 0.84 0.91 0.87 1564 1 0.66 0.49 0.56 549 accuracy 0.80 2113 macro avg 0.75 0.70 0.72 2113 weighted avg 0.79 0.80 0.79 2113 ------------ Classifier: KNN Accuracy: 0.77 precision recall f1-score support 0 0.82 0.89 0.85 1564 1 0.58 0.44 0.50 549 accuracy 0.77 2113 macro avg 0.70 0.66 0.68 2113 weighted avg 0.76 0.77 0.76 2113 ------------ Classifier: SVM Accuracy: 0.74 precision recall f1-score support 0 0.74 1.00 0.85 1564 1 0.00 0.00 0.00 549 accuracy 0.74 2113 macro avg 0.37 0.50 0.43 2113 weighted avg 0.55 0.74 0.63 2113 ------------
sm = SMOTEENN()
x_resampled, y_resampled = sm.fit_resample(x_train, y_train)
xr_train,xr_test,yr_train,yr_test=train_test_split(x_resampled, y_resampled,test_size=0.3)
model_smote = DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)
model_smote.fit(xr_train,yr_train)
yr_pred_smote = model_smote.predict(xr_test)
model_score_r = model_smote.score(xr_test, yr_test)
print(round(model_score_r, 2))
print(metrics.classification_report(yr_test, yr_pred_smote))
print(metrics.confusion_matrix(yr_test, yr_pred_smote))
0.9 precision recall f1-score support 0 0.86 0.95 0.90 585 1 0.95 0.85 0.90 632 accuracy 0.90 1217 macro avg 0.90 0.90 0.90 1217 weighted avg 0.90 0.90 0.90 1217 [[554 31] [ 93 539]]
as we have seen random forest did a better perform above even without smoteenn, lets's see how it performed after smoteen
model_rf_smote = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=6, min_samples_leaf=8)
model_rf_smote.fit(xr_train,yr_train)
yr_pred_smote = model_rf_smote.predict(xr_test)
model_score_r = model_rf_smote.score(xr_test, yr_test)
print(round(model_score_r, 2))
print(metrics.classification_report(yr_test, yr_pred_smote))
print(metrics.confusion_matrix(yr_test, yr_pred_smote))
0.94 precision recall f1-score support 0 0.96 0.92 0.94 585 1 0.93 0.96 0.94 632 accuracy 0.94 1217 macro avg 0.94 0.94 0.94 1217 weighted avg 0.94 0.94 0.94 1217 [[537 48] [ 25 607]]
model_gbm_smote = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5)
model_gbm_smote.fit(xr_train, yr_train)
yr_pred_smote = model_gbm_smote.predict(xr_test)
model_score_r = model_gbm_smote.score(xr_test, yr_test)
print(round(model_score_r, 2))
print(metrics.classification_report(yr_test, yr_pred_smote))
print(metrics.confusion_matrix(yr_test, yr_pred_smote))
0.96 precision recall f1-score support 0 0.96 0.95 0.96 585 1 0.96 0.96 0.96 632 accuracy 0.96 1217 macro avg 0.96 0.96 0.96 1217 weighted avg 0.96 0.96 0.96 1217 [[558 27] [ 25 607]]
model_knn_smote = KNeighborsClassifier(n_neighbors=5)
model_knn_smote.fit(xr_train,yr_train)
yr_pred_smote = model_knn_smote.predict(xr_test)
model_score_r = model_knn_smote.score(xr_test, yr_test)
print(round(model_score_r, 2))
print(metrics.classification_report(yr_test, yr_pred_smote))
print(metrics.confusion_matrix(yr_test, yr_pred_smote))
0.95 precision recall f1-score support 0 0.94 0.95 0.95 585 1 0.96 0.94 0.95 632 accuracy 0.95 1217 macro avg 0.95 0.95 0.95 1217 weighted avg 0.95 0.95 0.95 1217 [[558 27] [ 36 596]]
import pickle
knnmodel = 'knnchurnmodel.sav'
pickle.dump(model_knn_smote, open(knnmodel, 'wb'))
rfmodel = 'rfchurnmodel.sav'
pickle.dump(model_rf_smote, open(rfmodel, 'wb'))
gbm_model = 'gbmchurnmodel.sav'
pickle.dump(model_gbm_smote, open(gbm_model, 'wb'))
load_model = pickle.load(open(gbm_model, 'rb'))
gbm_model_score = load_model.score(xr_test, yr_test)
gbm_model_score
0.9572719802793755
load_model = pickle.load(open(rfmodel, 'rb'))
rf_model_score = load_model.score(xr_test, yr_test)
rf_model_score
0.9400164338537387
load_model = pickle.load(open(knnmodel, 'rb'))
knn_model_score = load_model.score(xr_test, yr_test)
knn_model_score
0.9482333607230896
I have saved three of the model "rfchurnmodel.sav" (Random forest), gbmchurnmodel.sav (Gradient Boosting) and "knnchurnmodel.sav" (K-nearest neighbours)
Now, I will use Knnchurnmodel.sav as my final model, to create APIs for accessing the model from the UI.
With this implementation, users can efficiently utilize the predictive power of the model through the user interface.